In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.offline as pyo 
import plotly.io as pio

from sklearn.preprocessing import StandardScaler
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
In [2]:
df= pd.read_csv('D:/Global DPI.csv')
In [3]:
df.head()
Out[3]:
Country AveragScore SafetySecurity PersonelFreedom Governance SocialCapital InvestmentEnvironment EnterpriseConditions MarketAccessInfrastructure EconomicQuality LivingConditions Health Education NaturalEnvironment
0 Denmark 84.55 92.59 94.09 89.45 82.56 82.42 79.64 78.79 76.81 95.77 81.07 87.48 73.94
1 Sweden 83.67 90.97 91.90 86.41 78.29 82.81 75.54 79.67 76.18 95.33 82.28 85.92 78.74
2 Norway 83.59 93.30 94.10 89.66 79.03 82.24 75.95 75.87 77.25 94.70 82.98 85.68 72.37
3 Finland 83.47 89.56 91.96 90.41 77.27 84.12 77.25 78.77 70.28 94.46 81.19 88.38 77.99
4 Switzerland 83.42 95.66 87.50 87.67 69.14 80.81 83.84 78.65 79.71 94.66 82.11 87.72 73.60
In [4]:
df.isnull().sum()
Out[4]:
Country                       0
AveragScore                   0
SafetySecurity                0
PersonelFreedom               0
Governance                    0
SocialCapital                 0
InvestmentEnvironment         0
EnterpriseConditions          0
MarketAccessInfrastructure    0
EconomicQuality               0
LivingConditions              0
Health                        0
Education                     0
NaturalEnvironment            0
dtype: int64
In [5]:
df.describe()
df.info()
df.duplicated().sum()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 167 entries, 0 to 166
Data columns (total 14 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Country                     167 non-null    object 
 1   AveragScore                 167 non-null    float64
 2   SafetySecurity              167 non-null    float64
 3   PersonelFreedom             167 non-null    float64
 4   Governance                  167 non-null    float64
 5   SocialCapital               167 non-null    float64
 6   InvestmentEnvironment       167 non-null    float64
 7   EnterpriseConditions        167 non-null    float64
 8   MarketAccessInfrastructure  167 non-null    float64
 9   EconomicQuality             167 non-null    float64
 10  LivingConditions            167 non-null    float64
 11  Health                      167 non-null    float64
 12  Education                   167 non-null    float64
 13  NaturalEnvironment          167 non-null    float64
dtypes: float64(13), object(1)
memory usage: 18.4+ KB
Out[5]:
0
In [6]:
scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df.drop("Country", axis=1)), columns=df.drop("Country", axis=1).columns)
In [7]:
plt.figure(figsize=(10, 8))
corr = df.drop("Country", axis=1).corr()
sns.heatmap(corr, annot=True, cmap='bwr', fmt=".2f")
plt.title("Correlation Matrix")
plt.show()
In [8]:
cols = ['AveragScore', 'SafetySecurity', 'PersonelFreedom',
       'Governance', 'SocialCapital', 'InvestmentEnvironment',
       'EnterpriseConditions', 'MarketAccessInfrastructure', 'EconomicQuality',
       'LivingConditions', 'Health', 'Education', 'NaturalEnvironment']
In [9]:
df.AveragScore.describe()
Out[9]:
count    167.000000
mean      58.056228
std       13.309964
min       30.400000
25%       47.770000
50%       57.530000
75%       66.860000
max       84.550000
Name: AveragScore, dtype: float64
In [11]:
ax = sns.boxplot(data=df['AveragScore'], orient='v', width=0.2)
ax.figure.set_size_inches(6, 5)
ax.set_title('AvgScore', fontsize=18)
ax.set_xlabel('AvgScore', fontsize=10)
plt.show()
In [12]:
import plotly.express as px

for i in cols:
    
    if i == 'AveragScore':
        top_10 = pd.DataFrame(df.groupby('Country')[['Country','AveragScore']].sum().sort_values('AveragScore', ascending=False).round(2).head(10))
        fig = px.bar(top_10, x = top_10.index, y = 'AveragScore',
            title = 'Top 10 Countries by AveragScore', template = 'seaborn', color = top_10.index, text = 'AveragScore')
        fig.show()
        top_10
        
    else:
        top_10 = pd.DataFrame(df.groupby('Country')[['Country',i]].sum().sort_values(i, ascending=False).round(2).head(10))
        fig = px.bar(top_10, x = top_10.index, y = i,
            title = 'Top 10 Countries by '+ i, template = 'seaborn', color = top_10.index, text = i)
        fig.show()
        top_10      
In [13]:
for i in cols:
    
    if i == 'AveragScore':
        char_bar = df.groupby(['Country'])[['AveragScore']].sum().reset_index()
        char_bar = char_bar.sort_values(by=("AveragScore"), ascending=True)

        top = char_bar.head(10)
        fig = go.Figure()
        fig.add_trace(go.Bar(x=top['Country'], y=top["AveragScore"]))

        fig.update_layout(title='Lowest Countries According to AveragScore',
                          xaxis_title='Country',
                          yaxis_title= "AveragScore",
                          plot_bgcolor='#F0EEED', 
                          paper_bgcolor='#F0EEED',  
                          font=dict(color='black'))

        pyo.init_notebook_mode(connected=True)
        pyo.iplot(fig)
        
    else:
        char_bar = df.groupby(['Country'])[[i]].sum().reset_index()
        char_bar = char_bar.sort_values(by=(i), ascending=True)

        top = char_bar.head(10)
        fig = go.Figure()
        fig.add_trace(go.Bar(x=top['Country'], y=top[i]))

        fig.update_layout(title='Lowest Countries According to '+ i,
                          xaxis_title='Country',
                          yaxis_title= i,
                          plot_bgcolor='#F0EEED', 
                          paper_bgcolor='#F0EEED',  
                          font=dict(color='black'))

        pyo.init_notebook_mode(connected=True)
        pyo.iplot(fig) 
In [14]:
import plotly.express as px

for i in cols:
    
    if i == 'AveragScore':
        fig = px.choropleth(df,locations='Country', color = i, locationmode='country names',title = f'{i} - Choropleth',color_continuous_scale='Viridis_r')
        fig.show()
        fig.write_html(f"geo-{i}.html")
    else:
        fig = px.choropleth(df,locations='Country', color = i, locationmode='country names',title = f'{i} - Choropleth',color_continuous_scale='Viridis_r')
        fig.show()
        fig.write_html(f"geo-{i}.html")
In [15]:
## K MEANS

df = df.set_index('Country')
In [19]:
!pip install yellowbrick
Collecting yellowbrick
  Obtaining dependency information for yellowbrick from https://files.pythonhosted.org/packages/06/35/c7d44bb541c06bc41b3239b27af79ea0ecc7dbb156ee1335576f99c58b91/yellowbrick-1.5-py3-none-any.whl.metadata
  Downloading yellowbrick-1.5-py3-none-any.whl.metadata (7.7 kB)
Requirement already satisfied: matplotlib!=3.0.0,>=2.0.2 in c:\users\jatin\anaconda3\lib\site-packages (from yellowbrick) (3.7.2)
Requirement already satisfied: scipy>=1.0.0 in c:\users\jatin\anaconda3\lib\site-packages (from yellowbrick) (1.11.1)
Requirement already satisfied: scikit-learn>=1.0.0 in c:\users\jatin\anaconda3\lib\site-packages (from yellowbrick) (1.3.0)
Requirement already satisfied: numpy>=1.16.0 in c:\users\jatin\anaconda3\lib\site-packages (from yellowbrick) (1.24.3)
Requirement already satisfied: cycler>=0.10.0 in c:\users\jatin\anaconda3\lib\site-packages (from yellowbrick) (0.11.0)
Requirement already satisfied: contourpy>=1.0.1 in c:\users\jatin\anaconda3\lib\site-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (1.0.5)
Requirement already satisfied: fonttools>=4.22.0 in c:\users\jatin\anaconda3\lib\site-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (4.25.0)
Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\jatin\anaconda3\lib\site-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (1.4.4)
Requirement already satisfied: packaging>=20.0 in c:\users\jatin\anaconda3\lib\site-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (23.1)
Requirement already satisfied: pillow>=6.2.0 in c:\users\jatin\anaconda3\lib\site-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (9.4.0)
Requirement already satisfied: pyparsing<3.1,>=2.3.1 in c:\users\jatin\anaconda3\lib\site-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (3.0.9)
Requirement already satisfied: python-dateutil>=2.7 in c:\users\jatin\anaconda3\lib\site-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (2.8.2)
Requirement already satisfied: joblib>=1.1.1 in c:\users\jatin\anaconda3\lib\site-packages (from scikit-learn>=1.0.0->yellowbrick) (1.2.0)
Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\jatin\anaconda3\lib\site-packages (from scikit-learn>=1.0.0->yellowbrick) (2.2.0)
Requirement already satisfied: six>=1.5 in c:\users\jatin\anaconda3\lib\site-packages (from python-dateutil>=2.7->matplotlib!=3.0.0,>=2.0.2->yellowbrick) (1.16.0)
Downloading yellowbrick-1.5-py3-none-any.whl (282 kB)
   ---------------------------------------- 0.0/282.6 kB ? eta -:--:--
   ---------------------------------------- 0.0/282.6 kB ? eta -:--:--
   -- ------------------------------------ 20.5/282.6 kB 330.3 kB/s eta 0:00:01
   -------------------- ------------------- 143.4/282.6 kB 1.7 MB/s eta 0:00:01
   ---------------------------------------- 282.6/282.6 kB 2.5 MB/s eta 0:00:00
Installing collected packages: yellowbrick
Successfully installed yellowbrick-1.5
In [20]:
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer
from sklearn.metrics import silhouette_score
In [30]:
import os
os.environ['OMP_NUM_THREADS'] = '1'
In [31]:
plt.figure(figsize=(12, 8))
elbow_graph = KElbowVisualizer(KMeans(random_state=123), k=10)
elbow_graph.fit(df)
elbow_graph.show()
C:\Users\Jatin\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning:

KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.

C:\Users\Jatin\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning:

KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.

C:\Users\Jatin\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning:

KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.

C:\Users\Jatin\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning:

KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.

C:\Users\Jatin\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning:

KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.

C:\Users\Jatin\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning:

KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.

C:\Users\Jatin\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning:

KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.

C:\Users\Jatin\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning:

KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.

C:\Users\Jatin\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning:

KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.

Out[31]:
<Axes: title={'center': 'Distortion Score Elbow for KMeans Clustering'}, xlabel='k', ylabel='distortion score'>
In [32]:
agrupador = KMeans(n_clusters = 3)
In [33]:
agrupador.fit(df)
labels = agrupador.labels_
print(labels)
C:\Users\Jatin\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning:

KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 2 2 2 2 0 2 0 0
 2 2 0 0 0 0 0 0 0 2 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
In [34]:
agrupador = KMeans(n_clusters = 3)
agrupador.fit(df)
labels = agrupador.labels_
labels
C:\Users\Jatin\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning:

KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.

Out[34]:
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 2,
       2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
In [35]:
from sklearn.metrics import silhouette_samples, silhouette_score
In [36]:
range_n_clusters = [i for i in range(2,10)]
print(range_n_clusters)
[2, 3, 4, 5, 6, 7, 8, 9]
In [37]:
from sklearn.cluster import KMeans

valores_silhueta = []
for k in range_n_clusters:
    agrupador = KMeans(n_clusters=k)
    labels = agrupador.fit_predict(df)
    media_silhueta = silhouette_score(df, labels)
    valores_silhueta.append(media_silhueta)
C:\Users\Jatin\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning:

KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.

C:\Users\Jatin\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning:

KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.

C:\Users\Jatin\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning:

KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.

C:\Users\Jatin\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning:

KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.

C:\Users\Jatin\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning:

KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.

C:\Users\Jatin\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning:

KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.

C:\Users\Jatin\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning:

KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.

C:\Users\Jatin\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning:

KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.

In [38]:
agrupador_kmeans = KMeans(n_clusters = 3)
labels_kmeans = agrupador_kmeans.fit_predict(df)
print("Labels K-means: ", labels_kmeans)
C:\Users\Jatin\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning:

KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.

Labels K-means:  [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 1 2 1 1
 2 2 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
In [39]:
print("The K-means silhouette coefficient is:", silhouette_score(df, labels_kmeans))
The K-means silhouette coefficient is: 0.38133235294176215
In [40]:
df["cluster"] = labels_kmeans
df.groupby("cluster").describe()
Out[40]:
AveragScore SafetySecurity ... Education NaturalEnvironment
count mean std min 25% 50% 75% max count mean ... 75% max count mean std min 25% 50% 75% max
cluster
0 41.0 76.649024 4.683675 68.24 72.99 77.31 80.31 84.55 41.0 86.537561 ... 85.19 91.44 41.0 67.135610 6.309125 53.20 63.23 68.58 71.71 78.74
1 57.0 43.884737 5.448067 30.40 41.87 44.67 47.87 53.68 57.0 52.389649 ... 44.52 61.10 57.0 51.321053 6.483988 33.67 48.46 52.69 56.10 62.22
2 69.0 58.715217 4.231192 47.71 55.91 58.56 61.27 66.88 69.0 68.047101 ... 69.19 81.87 69.0 53.809275 6.674120 40.27 48.56 53.92 58.64 69.35

3 rows × 104 columns

In [41]:
df.reset_index(level=0, inplace=True)
In [42]:
fig = px.choropleth(df,locations='Country', color = 'cluster', locationmode='country names',title = f'Cluster - Choropleth',color_continuous_scale='Rainbow')
fig.update_layout(margin={'r':0,'t':0,'l':0,'b':0}, coloraxis_colorbar=dict(
    title = 'Cluster',
    ticks = 'outside',
    tickvals = [0,1,2],
    dtick = 12))
fig.show()